3 General statistics

3.1 Sample statistics

3.1.1 All

# Count the distinct specimens and host taxa (species, orders, families)
# listed in the sample table and print them as a one-row table.
"data/sample.tsv" %>%
  read_tsv() %>%
  summarise(
    specimens = n_distinct(specimen_id),
    species = n_distinct(specimen_species),
    orders = n_distinct(specimen_order),
    families = n_distinct(specimen_family)
  ) %>%
  tt()
tinytable_mdq3zupe5lyb1z8kdldy
specimens species orders families
4364 244 22 67

3.1.2 Subset

Faecal and cloacal swab samples employed in the study.

# Distinct specimens/taxa among samples that have an extraction record
# (inner join on sample_id) and are faecal or anal/cloacal swab samples,
# plus the number of distinct samples of each of those two types.
read_tsv("data/sample.tsv") %>%
  inner_join(read_tsv("data/extraction.tsv"), by = "sample_id") %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  summarise(
    specimens = n_distinct(specimen_id),
    species = n_distinct(specimen_species),
    orders = n_distinct(specimen_order),
    families = n_distinct(specimen_family),
    # Per-type sample counts: subset sample_id by type before counting
    swabs = n_distinct(sample_id[sample_type == "Anal/cloacal swab"]),
    faeces = n_distinct(sample_id[sample_type == "Faecal"])
  ) %>%
  tt()
tinytable_rmdnh03roco4rxxl0vuk
specimens species orders families swabs faeces
2025 151 17 54 442 1824

3.1.3 Origin of samples (Figure S1)

# World map of capture locations: one point per row of the sample table,
# coloured by the taxonomic order of the specimen.

# Build the base-map polygons once; geom_map() needs them both as `data`
# and as `map`.
world_map <- map_data("world")

read_tsv("data/sample.tsv") %>%
  # Keep only the columns relevant to the map
  select(
    sample_id,
    specimen_species,
    specimen_order,
    specimen_class,
    capture_latitude,
    capture_longitude
  ) %>%
  # Add Gaussian jitter (sd = 0.5, in coordinate units) to the points.
  # NOTE(review): no seed is set in this chunk, so point positions will
  # differ between renders unless a seed is set earlier -- confirm.
  mutate(
    capture_latitude_jitter = capture_latitude + rnorm(n(), mean = 0, sd = 0.5),
    capture_longitude_jitter = capture_longitude + rnorm(n(), mean = 0, sd = 0.5)
  ) %>%
  # Plot map
  ggplot() +
  geom_map(
    data = world_map,
    map = world_map,
    aes(long, lat, map_id = region),
    # `linewidth` (not `size`) controls polygon outline width in ggplot2 >= 3.4.0
    color = "white", fill = "#cccccc", linewidth = 0.2
  ) +
  geom_point(
    aes(
      x = capture_longitude_jitter,
      y = capture_latitude_jitter,
      color = specimen_order
    ),
    alpha = 0.5, size = 0.5, shape = 16
  ) +
  labs(color = "Taxonomic order") +
  theme_minimal() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.position = "bottom"
  )

3.2 Data statistics

3.2.1 Total data

# Summarise `bases_pre_fastp` (converted to gigabases) for preprocessing
# records whose sample is faecal or anal/cloacal swab: overall and per-type
# totals, number of distinct preprocessing records per type, and the mean and
# standard deviation across records.
read_tsv("data/preprocessing.tsv") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
  mutate(bases_pre_fastp = bases_pre_fastp / 1e9) %>% # bases -> gigabases (Gb)
  summarise(
    total = sum(bases_pre_fastp, na.rm = TRUE),
    swabs = sum(
      bases_pre_fastp[sample_type == "Anal/cloacal swab"],
      na.rm = TRUE
    ),
    faeces = sum(
      bases_pre_fastp[sample_type == "Faecal"],
      na.rm = TRUE
    ),
    swabs_n = n_distinct(
      preprocessing_id[sample_type == "Anal/cloacal swab"],
      na.rm = TRUE
    ),
    faeces_n = n_distinct(
      preprocessing_id[sample_type == "Faecal"],
      na.rm = TRUE
    ),
    mean = mean(bases_pre_fastp, na.rm = TRUE),
    sd = sd(bases_pre_fastp, na.rm = TRUE)
  ) %>%
  tt()
tinytable_2q1toyomc8lu0jjl9c39
total swabs faeces swabs_n faeces_n mean sd
11262.32 1998.166 9264.149 345 1702 5.561637 4.804119

3.2.2 Quality-filtered data

# Total, mean and standard deviation of `bases_post_fastp` (in gigabases)
# across all rows of the preprocessing table.
# NOTE(review): unlike the "Total data" chunk, this one is not restricted to
# faecal / anal-cloacal swab samples, which is why the post-fastp total can
# exceed the pre-fastp total reported there -- confirm the intended scope.
"data/preprocessing.tsv" %>%
  read_tsv() %>%
  mutate(bases_post_fastp = bases_post_fastp / 1e9) %>% # bases -> gigabases (Gb)
  summarise(
    total = sum(bases_post_fastp, na.rm = TRUE),
    mean = mean(bases_post_fastp, na.rm = TRUE),
    sd = sd(bases_post_fastp, na.rm = TRUE)
  ) %>%
  tt()
tinytable_prjhtmcszqqk9zg42rnh
total mean sd
13486.81 5.377518 4.586315

3.2.3 Host genomic data

# Total, mean and standard deviation of `host_bases` (in gigabases) across
# all rows of the preprocessing table.
# NOTE(review): not restricted to faecal / anal-cloacal swab samples, unlike
# the "Total data" chunk -- confirm the intended scope.
"data/preprocessing.tsv" %>%
  read_tsv() %>%
  mutate(host_bases = host_bases / 1e9) %>% # bases -> gigabases (Gb)
  summarise(
    total = sum(host_bases, na.rm = TRUE),
    mean = mean(host_bases, na.rm = TRUE),
    sd = sd(host_bases, na.rm = TRUE)
  ) %>%
  tt()
tinytable_a1wqt8leaac5h4km1f1i
total mean sd
5554.962 2.19477 3.711089

3.2.4 Metagenomic data

# Total, mean and standard deviation of `metagenomic_bases` (in gigabases)
# across all rows of the preprocessing table.
# NOTE(review): not restricted to faecal / anal-cloacal swab samples, unlike
# the "Total data" chunk -- confirm the intended scope.
"data/preprocessing.tsv" %>%
  read_tsv() %>%
  mutate(metagenomic_bases = metagenomic_bases / 1e9) %>% # bases -> gigabases (Gb)
  summarise(
    total = sum(metagenomic_bases, na.rm = TRUE),
    mean = mean(metagenomic_bases, na.rm = TRUE),
    sd = sd(metagenomic_bases, na.rm = TRUE)
  ) %>%
  tt()
tinytable_ege24snb99lh5v58y5ru
total mean sd
7931.853 3.133881 3.272415

3.2.5 Assemblies

# Per assembly type: number of distinct assemblies (overall and per sample
# type) and the summed assembly length for swab and faecal samples.
left_join(read_tsv("data/assembly.tsv"),
          read_tsv("data/preprocessing.tsv"),
          by = "preprocessing_id") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  # The joined table can hold several rows for the same assembly_id (the
  # counts below rely on n_distinct() for that reason). Collapse to one row
  # per assembly and sample type so each assembly_length is summed once
  # instead of once per joined row.
  distinct(assembly_type, assembly_id, sample_type, assembly_length) %>%
  group_by(assembly_type) %>%
  summarise(
    assembly_n = n_distinct(assembly_id),
    swabs_n = n_distinct(
      assembly_id[sample_type == "Anal/cloacal swab"],
      na.rm = TRUE
    ),
    faeces_n = n_distinct(
      assembly_id[sample_type == "Faecal"],
      na.rm = TRUE
    ),
    swabs_size = sum(
      assembly_length[sample_type == "Anal/cloacal swab"],
      na.rm = TRUE
    ),
    faeces_size = sum(
      assembly_length[sample_type == "Faecal"],
      na.rm = TRUE
    )
  ) %>%
  tt()
tinytable_6mne2c7hz4qhb4tpwevt
assembly_type assembly_n swabs_n faeces_n swabs_size faeces_size
Coassembly 294 41 227 20029149172 916263568901
Individual 1722 177 1377 2337823366 132226412145
Multisplit 1 0 1 0 0
NA 1 0 1 0 0

3.2.6 MAGs

# MAG catalogue summary: number of distinct MAGs (overall and per sample
# type), number of distinct phyla, and mean completeness / contamination.
left_join(read_tsv("data/mag.tsv"),
          read_tsv("data/assembly.tsv"),
          by = "assembly_id") %>%
  left_join(read_tsv("data/preprocessing.tsv"), by = "preprocessing_id") %>%
  left_join(read_tsv("data/sample.tsv"), by = "sample_id") %>%
  # One row per MAG and sample type. A MAG linked to samples of more than one
  # type therefore still occupies several rows here.
  distinct(
    mag_id, mag_phylum, mag_completeness, mag_contamination, sample_type
  ) %>%
  summarise(
    # Count MAGs, not MAG x sample-type rows
    number = n_distinct(mag_id),
    swabs_n = n_distinct(
      mag_id[sample_type == "Anal/cloacal swab"],
      na.rm = TRUE
    ),
    faeces_n = n_distinct(
      mag_id[sample_type == "Faecal"],
      na.rm = TRUE
    ),
    phylums = n_distinct(mag_phylum),
    # Average over the first row of each MAG so that MAGs spanning several
    # sample types are not weighted more than once
    completeness = mean(mag_completeness[!duplicated(mag_id)]),
    contamination = mean(mag_contamination[!duplicated(mag_id)])
  ) %>%
  tt()
tinytable_1zfyy4ufe2lx5dsujjuc
number swabs_n faeces_n phylums completeness contamination
51690 1896 47757 42 83.52126 2.000393